{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:07:41.192577Z",
     "start_time": "2020-08-05T07:07:40.426259Z"
    }
   },
   "outputs": [],
   "source": [
    "import warnings\n",
    "from datetime import datetime, timedelta\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from scipy import stats\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from collections import Counter\n",
    "import math\n",
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "from tqdm import tqdm\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "import gc\n",
    "import os\n",
    "\n",
    "import lightgbm as lgb\n",
    "from sklearn.model_selection import StratifiedKFold, KFold\n",
    "from sklearn.metrics import roc_auc_score\n",
    "from gensim.models import Word2Vec\n",
    "from sklearn.feature_selection import SelectPercentile, f_classif, chi2\n",
    "\n",
    "warnings.simplefilter('ignore')\n",
    "tqdm.pandas()\n",
    "%matplotlib inline\n",
    "\n",
    "pd.set_option('max_columns', None)\n",
    "pd.set_option('max_rows', None)\n",
    "pd.set_option('max_colwidth', 200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install pandas==0.25.3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:07:41.196919Z",
     "start_time": "2020-08-05T07:07:41.194545Z"
    }
   },
   "outputs": [],
   "source": [
    "seed = 2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:07:47.969140Z",
     "start_time": "2020-08-05T07:07:41.198026Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# 读取数据\n",
    "df_train_label = pd.read_csv('raw_data/train/train_label.csv')\n",
    "df_train_base = pd.read_csv('raw_data/train/train_base.csv')\n",
    "df_train_trans = pd.read_csv('raw_data/train/train_trans.csv')\n",
    "df_train_op = pd.read_csv('raw_data/train/train_op.csv')\n",
    "\n",
    "df_test_base = pd.read_csv('raw_data/test_b/testb_base.csv')\n",
    "df_test_trans = pd.read_csv('raw_data/test_b/testb_trans.csv')\n",
    "df_test_op = pd.read_csv('raw_data/test_b/testb_op.csv')\n",
    "\n",
    "df_trans = df_train_trans.append(df_test_trans)\n",
    "df_trans = df_trans.reset_index(drop=True)\n",
    "\n",
    "df_op = df_train_op.append(df_test_op)\n",
    "df_op = df_op.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:08:31.576714Z",
     "start_time": "2020-08-05T07:07:47.970444Z"
    }
   },
   "outputs": [],
   "source": [
    "def parse_time(tm):\n",
    "    days, _, time = tm.split(' ')\n",
    "    time = time.split('.')[0]\n",
    "\n",
    "    time = '2020-1-1 ' + time\n",
    "    time = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')\n",
    "    time = (time + timedelta(days=int(days)))\n",
    "\n",
    "    return time\n",
    "\n",
    "\n",
    "df_trans['date'] = df_trans['tm_diff'].apply(parse_time)\n",
    "df_trans['day'] = df_trans['date'].dt.day\n",
    "df_trans['hour'] = df_trans['date'].dt.hour\n",
    "\n",
    "df_op['date'] = df_op['tm_diff'].apply(parse_time)\n",
    "df_op['day'] = df_op['date'].dt.day\n",
    "df_op['hour'] = df_op['date'].dt.hour"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:08:34.131987Z",
     "start_time": "2020-08-05T07:08:31.578002Z"
    }
   },
   "outputs": [],
   "source": [
    "df_trans.sort_values(['user', 'date'], inplace=True)\n",
    "df_trans = df_trans.reset_index(drop=True)\n",
    "\n",
    "df_op.sort_values(['user', 'date'], inplace=True)\n",
    "df_op = df_op.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:08:34.162347Z",
     "start_time": "2020-08-05T07:08:34.133279Z"
    }
   },
   "outputs": [],
   "source": [
    "df_train_base.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:08:34.305394Z",
     "start_time": "2020-08-05T07:08:34.164285Z"
    }
   },
   "outputs": [],
   "source": [
    "df_train = df_train_base.merge(df_train_label, how='left')\n",
    "df_test = df_test_base\n",
    "\n",
    "df_feature = df_train.append(df_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:08:34.311400Z",
     "start_time": "2020-08-05T07:08:34.306977Z"
    }
   },
   "outputs": [],
   "source": [
    "def select_feature(df, select_feature, ycol, p):\n",
    "    X = df[select_feature]\n",
    "    X.fillna(0, inplace=True)\n",
    "    Y = df[ycol]\n",
    "\n",
    "    selectChi2 = SelectPercentile(chi2, percentile=p).fit(X, Y)\n",
    "    selectF_classif = SelectPercentile(f_classif, percentile=p).fit(X, Y)\n",
    "\n",
    "    chi2_selected = selectChi2.get_support()\n",
    "    print('Chi2 selected {} features.'.format(chi2_selected.sum()))\n",
    "    f_classif_selected = selectF_classif.get_support()\n",
    "    print('F_classif selected {} features.'.format(f_classif_selected.sum()))\n",
    "    selected = chi2_selected & f_classif_selected\n",
    "    print('Chi2 & F_classif selected {} features'.format(selected.sum()))\n",
    "    selected_features = [f for f, s in zip(select_feature, selected) if s]\n",
    "\n",
    "    del_features = list(set(select_feature) - set(selected_features))\n",
    "    del_features.sort()\n",
    "    return del_features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-20T02:42:35.266931Z",
     "start_time": "2020-07-20T02:42:35.263531Z"
    }
   },
   "source": [
    "# 特征工程"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:08:34.320566Z",
     "start_time": "2020-08-05T07:08:34.312481Z"
    },
    "code_folding": [
     4
    ]
   },
   "outputs": [],
   "source": [
    "os.makedirs('model', exist_ok=True)\n",
    "os.makedirs('embedding', exist_ok=True)\n",
    "\n",
    "\n",
    "def w2v_emb(df, f1, f2, prefix):\n",
    "    emb_size = 32\n",
    "\n",
    "    model_path = 'model/{}_w2v_{}_{}_{}.m'.format(prefix, f1, f2, emb_size)\n",
    "    embedding_path = 'embedding/{}_{}_{}_{}.pkl'.format(prefix, f1, f2, emb_size)\n",
    "\n",
    "    if os.path.exists(embedding_path):\n",
    "        embedding = pd.read_pickle(embedding_path)\n",
    "        return embedding\n",
    "\n",
    "    tmp = df.groupby(f1, as_index=False)[f2].agg(\n",
    "        {'{}_{}_list'.format(f1, f2): list})\n",
    "\n",
    "    sentences = tmp['{}_{}_list'.format(f1, f2)].values.tolist()\n",
    "    del tmp['{}_{}_list'.format(f1, f2)]\n",
    "    for i in range(len(sentences)):\n",
    "        sentences[i] = [str(x) for x in sentences[i]]\n",
    "\n",
    "    if os.path.exists(model_path):\n",
    "        model = Word2Vec.load(model_path)\n",
    "    else:\n",
    "        model = Word2Vec(sentences,\n",
    "                         size=emb_size,\n",
    "                         window=5,\n",
    "                         min_count=5,\n",
    "                         sg=0,\n",
    "                         hs=1,\n",
    "                         seed=seed)\n",
    "        model.save(model_path)\n",
    "\n",
    "    emb_matrix = []\n",
    "    for seq in sentences:\n",
    "        vec = []\n",
    "        for w in seq:\n",
    "            if w in model:\n",
    "                vec.append(model[w])\n",
    "        if len(vec) > 0:\n",
    "            emb_matrix.append(np.mean(vec, axis=0))\n",
    "        else:\n",
    "            emb_matrix.append([0] * emb_size)\n",
    "\n",
    "    df_emb = pd.DataFrame(emb_matrix)\n",
    "    df_emb.columns = [\n",
    "        '{}_{}_{}_emb_{}'.format(prefix, f1, f2, i) for i in range(emb_size)\n",
    "    ]\n",
    "\n",
    "    embedding = pd.concat([tmp, df_emb], axis=1)\n",
    "    embedding.to_pickle(embedding_path)\n",
    "\n",
    "    return embedding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:08:34.389106Z",
     "start_time": "2020-08-05T07:08:34.321716Z"
    },
    "code_folding": []
   },
   "outputs": [],
   "source": [
    "def tfidf_emb(df, f1, f2, prefix):\n",
    "    emb_size = 32\n",
    "\n",
    "    df[f2] = df[f2].astype(str)\n",
    "    df[f2].fillna('-1', inplace=True)\n",
    "    group_df = df.groupby([f1]).apply(\n",
    "        lambda x: x[f2].tolist()).reset_index()\n",
    "    group_df.columns = [f1, 'list']\n",
    "    group_df['list'] = group_df['list'].apply(lambda x: ','.join(x))\n",
    "    enc_vec = TfidfVectorizer()\n",
    "    tfidf_vec = enc_vec.fit_transform(group_df['list'])\n",
    "    svd_enc = TruncatedSVD(n_components=emb_size, n_iter=20, random_state=seed)\n",
    "    vec_svd = svd_enc.fit_transform(tfidf_vec)\n",
    "    vec_svd = pd.DataFrame(vec_svd)\n",
    "    vec_svd.columns = ['{}_svd_tfidf_{}_{}'.format(prefix,\n",
    "        f2, i) for i in range(emb_size)]\n",
    "    group_df = pd.concat([group_df, vec_svd], axis=1)\n",
    "    del group_df['list']\n",
    "    return group_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:08:34.395748Z",
     "start_time": "2020-08-05T07:08:34.390425Z"
    }
   },
   "outputs": [],
   "source": [
    "def countvec_emb(df, f1, f2):\n",
    "    emb_size = 32\n",
    "\n",
    "    df[f2] = df[f2].astype(str)\n",
    "    df[f2].fillna('-1', inplace=True)\n",
    "    group_df = df.groupby([f1]).apply(\n",
    "        lambda x: x[f2].tolist()).reset_index()\n",
    "    group_df.columns = [f1, 'list']\n",
    "    group_df['list'] = group_df['list'].apply(lambda x: ','.join(x))\n",
    "    enc_vec = CountVectorizer()\n",
    "    tfidf_vec = enc_vec.fit_transform(group_df['list'])\n",
    "    svd_enc = TruncatedSVD(n_components=emb_size, n_iter=20, random_state=seed)\n",
    "    vec_svd = svd_enc.fit_transform(tfidf_vec)\n",
    "    vec_svd = pd.DataFrame(vec_svd)\n",
    "    vec_svd.columns = ['svd_countvec_{}_{}'.format(\n",
    "        f2, i) for i in range(emb_size)]\n",
    "    group_df = pd.concat([group_df, vec_svd], axis=1)\n",
    "    del group_df['list']\n",
    "    return group_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_trend_feature(arr, abs_values=False):\n",
    "    idx = np.array(range(len(arr)))\n",
    "    if abs_values:\n",
    "        arr = np.abs(arr)\n",
    "    lr = LinearRegression()\n",
    "    lr.fit(idx.reshape(-1, 1), arr)\n",
    "    return lr.coef_[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 操作信息特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_op['date_diff'] = df_op.groupby('user')['date'].diff()\n",
    "df_op['op_second_diff'] = df_op['date_diff'].dt.seconds\n",
    "df_op['op_hour_diff'] = df_op['op_second_diff'] / 3600\n",
    "df_op['op_day_diff'] = df_op['op_hour_diff'] / 24"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:08:34.476009Z",
     "start_time": "2020-08-05T07:08:34.396830Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df_op.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for window in [15, 3, 5]:\n",
    "    for col in tqdm(['op_type', 'op_mode', 'net_type', 'channel', 'hour']):\n",
    "        df_temp = df_op[df_op['day'] > 15 - window][['user', col]].copy()\n",
    "        df_temp['tmp'] = 1\n",
    "        df_temp = df_temp.pivot_table(index='user', columns=col,\n",
    "                                      values='tmp', aggfunc=np.sum).reset_index().fillna(0)\n",
    "        df_temp.columns = [c if c == 'user' else 'op_{}_{}_count_{}d'.format(\n",
    "            col, c, window) for c in df_temp.columns]\n",
    "        df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:09:04.247876Z",
     "start_time": "2020-08-05T07:09:03.147363Z"
    }
   },
   "outputs": [],
   "source": [
    "for col in tqdm(['op_type', 'op_mode', 'net_type', 'channel']):\n",
    "    df_temp = df_op[['user', 'hour', col]].copy()\n",
    "    df_temp = df_temp.pivot_table(index='user', columns=col,\n",
    "                                  values='hour', aggfunc=['mean', 'std', 'max', 'min']).fillna(0)\n",
    "    df_temp.columns = ['op_{}_{}_hour_{}'.format(col, f[1], f[0]) for f in df_temp.columns]\n",
    "    df_temp.reset_index(inplace=True)\n",
    "    df_temp.rename({'index': 'user'}, inplace=True, axis=1)\n",
    "    df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:09:04.871719Z",
     "start_time": "2020-08-05T07:09:04.249445Z"
    }
   },
   "outputs": [],
   "source": [
    "df_temp = df_op.groupby(['user', 'op_device']).size().reset_index()\n",
    "df_temp.drop([0], axis=1, inplace=True)\n",
    "df_temp = df_temp.sort_values(\n",
    "    by=['user', 'op_device'], ascending=['asc', 'asc'])\n",
    "df_temp.drop_duplicates('user', keep='last', inplace=True)\n",
    "df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:09:05.548655Z",
     "start_time": "2020-08-05T07:09:04.873012Z"
    }
   },
   "outputs": [],
   "source": [
    "for f in ['hour', 'day', 'op_second_diff']:\n",
    "    df_temp = df_op.groupby('user')[f].agg({\n",
    "        'op_{}_mean'.format(f): 'mean', \n",
    "        'op_{}_std'.format(f): 'std', \n",
    "        'op_{}_max'.format(f): 'max',\n",
    "        'op_{}_min'.format(f): 'min'\n",
    "    }).reset_index()\n",
    "    df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:09:21.476090Z",
     "start_time": "2020-08-05T07:09:21.240636Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 交易信息特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_trans['date_diff'] = df_trans.groupby('user')['date'].diff()\n",
    "df_trans['trans_second_diff'] = df_trans['date_diff'].dt.seconds\n",
    "df_trans['trans_hour_diff'] = df_trans['trans_second_diff'] / 3600\n",
    "df_trans['trans_day_diff'] = df_trans['trans_hour_diff'] / 24"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_trans['amount'] = np.log1p(df_trans['amount'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:09:21.488224Z",
     "start_time": "2020-08-05T07:09:21.477296Z"
    }
   },
   "outputs": [],
   "source": [
    "df_trans.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for col in tqdm(['platform', 'tunnel_in', 'tunnel_out', 'type1', 'type2', 'hour']):\n",
    "    df_temp = df_trans.pivot_table(\n",
    "        index='user', columns=col, values='amount', aggfunc=['sum', 'mean', 'max', 'min', 'std', 'median']).fillna(0)\n",
    "    df_temp.columns = ['trans_{}_{}_amount_{}'.format(col, f[1], f[0]) for f in df_temp.columns]\n",
    "    df_temp.reset_index(inplace=True)\n",
    "    df_temp.rename({'index': 'user'}, inplace=True, axis=1)\n",
    "\n",
    "    df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:09:25.511709Z",
     "start_time": "2020-08-05T07:09:24.236942Z"
    }
   },
   "outputs": [],
   "source": [
    "for window in tqdm([31, 1, 3, 5, 7, 10, 15]):\n",
    "    df_temp = df_trans[df_trans['day'] > 31-window].groupby('user')['amount'].agg({\n",
    "        'trans_amount_mean_{}d'.format(window): 'mean',\n",
    "        'trans_amount_std_{}d'.format(window): 'std',\n",
    "        'trans_amount_max_{}d'.format(window): 'max',\n",
    "        'trans_amount_min_{}d'.format(window): 'min',\n",
    "        'trans_amount_sum_{}d'.format(window): 'sum',\n",
    "    }).reset_index()\n",
    "    df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:09:26.287488Z",
     "start_time": "2020-08-05T07:09:25.513091Z"
    }
   },
   "outputs": [],
   "source": [
    "for window in [3, 5, 10]:\n",
    "    for col in ['type1', 'type2']:\n",
    "        df_temp = df_trans[df_trans['day'] > 31 - window].pivot_table(\n",
    "            index='user', columns=col, values='amount', aggfunc=['sum']).fillna(0)\n",
    "        df_temp.columns = ['trans_{}_{}_amount_{}_{}d'.format(col, f[1], f[0], window) for f in df_temp.columns]\n",
    "        df_temp.reset_index(inplace=True)\n",
    "        df_temp.rename({'index': 'user'}, inplace=True, axis=1)\n",
    "        \n",
    "        df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:09:26.769652Z",
     "start_time": "2020-08-05T07:09:26.288861Z"
    }
   },
   "outputs": [],
   "source": [
    "for f in ['ip', 'ip_3']:    \n",
    "    df_temp = df_trans.groupby(['user'])[f].agg({\n",
    "        'trans_{}_count'.format(f): 'count'\n",
    "    }).reset_index()\n",
    "    df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for f in ['hour', 'trans_day_diff']:\n",
    "    df_temp = df_trans.groupby('user')[f].agg({\n",
    "        'trans_{}_mean'.format(f): 'mean', \n",
    "        'trans_{}_std'.format(f): 'std',\n",
    "    }).reset_index()\n",
    "    df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:09:41.917548Z",
     "start_time": "2020-08-05T07:09:41.552529Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-07-20T08:57:03.376937Z",
     "start_time": "2020-07-20T08:57:03.373425Z"
    }
   },
   "source": [
    "## 基本信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:09:42.416684Z",
     "start_time": "2020-08-05T07:09:41.918761Z"
    }
   },
   "outputs": [],
   "source": [
    "for f in [\n",
    "        'balance', 'balance_avg', 'balance1', 'balance1_avg', 'balance2',\n",
    "        'balance2_avg', 'product1_amount', 'product2_amount',\n",
    "        'product3_amount', 'product4_amount', 'product5_amount', 'product6_amount'\n",
    "]:\n",
    "    df_feature[f] = df_feature[f].apply(lambda x: int(\n",
    "        x.split(' ')[1]) if type(x) != float else np.NaN)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cate_features = ['sex', 'provider', 'level', 'verified', 'regist_type', 'agreement1', 'agreement2', 'agreement3', 'agreement4', 'province', 'city', 'service3', \n",
    "                 'service3_level']\n",
    "\n",
    "for f1 in tqdm(cate_features):\n",
    "    for f2 in cate_features:\n",
    "        df_feature['{}_{}'.format(f1, f2)] = df_feature[f1] + '_' + df_feature[f2]\n",
    "        \n",
    "for f in tqdm(cate_features):\n",
    "    df_feature['{}_cnt'.format(f)] = df_feature.groupby([f])['user'].transform('count')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dense_features = ['age', 'using_time', 'card_a_cnt', 'card_b_cnt', 'card_c_cnt', 'card_d_cnt', 'op1_cnt', 'op2_cnt', 'service1_cnt', 'service1_amt', 'service2_cnt', \n",
    "                  'agreement_total', 'acc_count', 'login_cnt_period1', 'login_cnt_period2', 'ip_cnt', 'login_cnt_avg', 'login_days_cnt', 'balance', 'balance_avg', \n",
    "                  'balance1', 'balance1_avg', 'balance2', 'balance2_avg', 'product1_amount', 'product2_amount', 'product3_amount', 'product4_amount', 'product5_amount',\n",
    "                 'product6_amount', 'product7_cnt', 'product7_fail_cnt']\n",
    "\n",
    "min_max = MinMaxScaler()\n",
    "df_feature[dense_features] = min_max.fit_transform(df_feature[dense_features].values)\n",
    "\n",
    "for f1 in tqdm(dense_features):\n",
    "    for f2 in dense_features:\n",
    "        if f1 != f2:\n",
    "            df_feature['{}_add_{}'.format(f1, f2)] = df_feature[f1] + df_feature[f2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:09:42.429451Z",
     "start_time": "2020-08-05T07:09:42.418098Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature['product7_fail_ratio'] = df_feature[\n",
    "    'product7_fail_cnt'] / df_feature['product7_cnt']\n",
    "df_feature['card_cnt'] = df_feature['card_a_cnt'] + df_feature[\n",
    "    'card_b_cnt'] + df_feature['card_c_cnt'] + df_feature['card_d_cnt']\n",
    "\n",
    "df_feature['acc_card_ratio'] = df_feature['acc_count'] / df_feature['card_cnt']\n",
    "df_feature['login_cnt'] = df_feature['login_cnt_period1'] + \\\n",
    "    df_feature['login_cnt_period2']\n",
    "\n",
    "df_feature['login_cnt_period2_login_cnt_ratio'] = df_feature['login_cnt_period2'] / \\\n",
    "    df_feature['login_cnt']\n",
    "df_feature['login_cnt_period1_login_cnt_ratio'] = df_feature['login_cnt_period1'] / \\\n",
    "    df_feature['login_cnt']\n",
    "\n",
    "df_feature['using_time_op2_cnt_ratio'] = df_feature['using_time'] / \\\n",
    "    df_feature['op2_cnt']\n",
    "df_feature['using_time_op1_cnt_ratio'] = df_feature['using_time'] / \\\n",
    "    df_feature['op1_cnt']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:09:50.894821Z",
     "start_time": "2020-08-05T07:09:42.431123Z"
    },
    "code_folding": []
   },
   "outputs": [],
   "source": [
    "# 欺诈率\n",
    "def stat(df, df_merge, group_by, agg):\n",
    "    group = df.groupby(group_by).agg(agg)\n",
    "\n",
    "    columns = []\n",
    "    for on, methods in agg.items():\n",
    "        for method in methods:\n",
    "            columns.append('{}_{}_{}'.format('_'.join(group_by), on, method))\n",
    "    group.columns = columns\n",
    "    group.reset_index(inplace=True)\n",
    "    df_merge = df_merge.merge(group, on=group_by, how='left')\n",
    "\n",
    "    del (group)\n",
    "    gc.collect()\n",
    "    return df_merge\n",
    "\n",
    "\n",
    "def statis_feat(df_know, df_unknow):\n",
    "    df_unknow = stat(df_know, df_unknow, ['province'], {'label': ['mean']})\n",
    "    df_unknow = stat(df_know, df_unknow, ['city'], {'label': ['mean']})\n",
    "    df_unknow = stat(df_know, df_unknow, [\n",
    "                     'city', 'level'], {'label': ['mean']})\n",
    "    df_unknow = stat(df_know, df_unknow, ['op_device'], {'label': ['mean']})\n",
    "    df_unknow = stat(df_know, df_unknow, [\n",
    "                     'age', 'op_device'], {'label': ['mean']})\n",
    "    df_unknow = stat(df_know, df_unknow, ['using_time'], {'label': ['mean']})\n",
    "    df_unknow = stat(df_know, df_unknow, [\n",
    "                     'city', 'op_device'], {'label': ['mean']})\n",
    "    df_unknow = stat(df_know, df_unknow, ['age', 'city'], {'label': ['mean']})\n",
    "    df_unknow = stat(df_know, df_unknow, [\n",
    "                     'op_device', 'level'], {'label': ['mean']})\n",
    "\n",
    "    return df_unknow\n",
    "\n",
    "\n",
    "df_train = df_feature[~df_feature['label'].isnull()]\n",
    "df_train = df_train.reset_index(drop=True)\n",
    "df_test = df_feature[df_feature['label'].isnull()]\n",
    "\n",
    "df_stas_feat = None\n",
    "kf = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)\n",
    "for train_index, val_index in kf.split(df_train, df_train['label']):\n",
    "    df_fold_train = df_train.iloc[train_index]\n",
    "    df_fold_val = df_train.iloc[val_index]\n",
    "\n",
    "    df_fold_val = statis_feat(df_fold_train, df_fold_val)\n",
    "    df_stas_feat = pd.concat([df_stas_feat, df_fold_val], axis=0)\n",
    "\n",
    "    del (df_fold_train)\n",
    "    del (df_fold_val)\n",
    "    gc.collect()\n",
    "\n",
    "df_test = statis_feat(df_train, df_test)\n",
    "df_feature = pd.concat([df_stas_feat, df_test], axis=0)\n",
    "df_feature = df_feature.reset_index(drop=True)\n",
    "\n",
    "del (df_stas_feat)\n",
    "del (df_train)\n",
    "del (df_test)\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for f in tqdm(cate_features):\n",
    "    for f2 in dense_features:\n",
    "        df_feature['{}_{}_mean'.format(f, f2)] = df_feature.groupby([f])[f2].transform('mean')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T07:09:53.425571Z",
     "start_time": "2020-08-05T07:09:51.599883Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature.to_pickle('data/feature1.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
